Hyperparameter Optimizer¶

References

  • Optuna simple example https://github.com/optuna/optuna-examples/blob/main/pytorch/pytorch_simple.py#L71
  • Optuna RL example https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py#L114
  • Hugging Face policy gradient https://huggingface.co/learn/deep-rl-course/unit4/hands-on
In [ ]:
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install tensorflow
!pip install numpy
!pip install torch
!pip install -U scikit-learn
!pip install optuna
!pip install stable-baselines3[extra]
!pip install cmaes
In [1]:
import os
import gymnasium as gym
from collections import deque
from typing import Any
from typing import Dict
from tqdm import tqdm

import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import optuna
from optuna.trial import TrialState

# Load environment
env = gym.make("ALE/MsPacman-ram-v5")
In [2]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(50);
In [3]:
default_hyperparams = {
    "epoch": 20,  # max number of episodes per optimization trial
#     "n_training_episodes": 5000,  TODO: Delete
    "max_t": 50000,  # max number of steps per trial
#     "env_id": "ALE/MsPacman-ram-v5",  TODO: Delete
    "state_space": 128,  # RAM data for Atari console during game
    "action_space": 5,  # No-op, up, right, left, down
}
In [4]:
# Based off Optuna RL example code
# Changes by CS 175 project group: hyperparameters being sampled
def sample_hyperparams(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for hyperparameters."""
    gamma = trial.suggest_float("gamma", 0.99995, 1, log=True)
    n_layers = trial.suggest_int("n_layers", 1, 5)
    h_size = trial.suggest_int("h_size", 4, 1024)
    dropout = trial.suggest_float("dropout", 0.0, 0.7, log=False)
    lr = trial.suggest_float("lr", 1e-6, 1e-2, log=True)
    longevity_exponential = trial.suggest_float("longevity_exponential", 1.001, 1.01, log=True)
    step_penalty_multiplier = trial.suggest_float("step_penalty_multiplier", 1, 1.1, log=True)
#     ghost_reward = trial.suggest_int("ghost_reward", -1000, 1000)
    ghost_reward = 0
    dot_extra_reward = trial.suggest_int("dot_extra_reward", 0, 20)
    energy_pill_extra_reward = trial.suggest_int("energy_pill_extra_reward", 0, 100)
    # optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    optimizer = "SGD"

    # Display true values.
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("n_layers_", n_layers)
    trial.set_user_attr("h_size_", h_size)
    trial.set_user_attr("dropout_", dropout)
    trial.set_user_attr("lr_", lr)
    trial.set_user_attr("longevity_exponential_", longevity_exponential)
    trial.set_user_attr("step_penalty_multiplier_", step_penalty_multiplier)
    trial.set_user_attr("ghost_reward_", ghost_reward)
    trial.set_user_attr("dot_extra_reward_", dot_extra_reward)
    trial.set_user_attr("energy_pill_extra_reward_", energy_pill_extra_reward)
    trial.set_user_attr("optimizer_", optimizer)

    return {
        "gamma": gamma,
        "n_layers": n_layers,
        "h_size": h_size,
        "dropout": dropout,
        "lr": lr,
        "longevity_exponential": longevity_exponential,
        "step_penalty_multiplier": step_penalty_multiplier,
        "ghost_reward": ghost_reward,
        "dot_extra_reward": dot_extra_reward,
        "energy_pill_extra_reward": energy_pill_extra_reward,
        "optimizer": optimizer
    }
In [5]:
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group: 
#   - class inherits nn.Sequential rather than nn.Module
#   - change to constructor method and deletion of explicitly defined forward method
class Policy(nn.Sequential):
  def __init__(self, n_layers, h_size, dropout, s_size, a_size):
    layers = []

    in_features = s_size
    for i in range(n_layers):
      layers.append(nn.Linear(in_features, h_size))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(dropout))

      in_features = h_size
    layers.append(nn.Linear(in_features, a_size))
    layers.append(nn.LogSoftmax(dim=1))

    super().__init__(*layers)

  def act(self, state):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    probs = self.forward(state).cpu()
    m = Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)
In [6]:
# Contains policy trainer from Hugging Face policy gradient code
# Changes by CS 175 project group: 
#   - changes to reward for training
#   - ensure changes to reward doesn't affect score output
#   - added Optuna methods to evaluate episodes and prune trials if needed
#   - cut out portions from original code not needed by trainer
def train(trial, policy, optimizer, epoch, max_t, gamma, ghost_reward, step_penalty_multiplier, 
          longevity_exponential=0, dot_extra_reward=0, energy_pill_extra_reward=0):
    for i_epoch in range(epoch + 1):
        saved_log_probs = []
        rewards = []
        state,game_env = env.reset()
        
        # Variables for reward changes
        step_num = 0
        score_adjustments = 0
        rewards_this_life = 0
        step_penalty = 1
        cur_step_penalty = step_penalty

        for t in range(max_t):
            old_game_env = game_env

            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _, game_env = env.step(action)
            
            # Longevity reward. More reward gathered for each life, larger reward
            if old_game_env["lives"] > game_env["lives"]:
                longevity_reward = longevity_exponential ** rewards_this_life
                rewards_this_life = 0
                reward += longevity_reward
                score_adjustments -= longevity_reward
                rewards.append(reward)
                continue
                
            reward_change = 0
            
            # Equal penalty for eating ghost
            if reward // 100 == 2:
              reward_change = reward - 200 + ghost_reward
              score_adjustments += 200 - ghost_reward
            elif reward // 100 == 4:
              reward_change = reward - 400 + ghost_reward
              score_adjustments += 400 - ghost_reward
            elif reward // 100 == 8:
              reward_change = reward - 800 + ghost_reward
              score_adjustments += 800 - ghost_reward
            elif reward // 100 == 16:
              reward_change = reward - 1600 + ghost_reward
              score_adjustments += 1600 - ghost_reward
                
            # Penalty for going many steps without eating dot
            if reward % 100 == 10:
                cur_step_penalty = step_penalty
                reward_change += dot_extra_reward
                score_adjustments -= dot_extra_reward
            elif reward % 100 == 50:
                cur_step_penalty = step_penalty
                reward_change += energy_pill_extra_reward
                score_adjustments -= energy_pill_extra_reward
            else:
                cur_step_penalty *= step_penalty_multiplier
                reward_change -= step_penalty
                score_adjustments += step_penalty
            
            rewards.append(reward + reward_change)

            if done:
                break

        final_score = sum(rewards) + score_adjustments

        returns = deque(maxlen=max_t)
        n_steps = len(rewards)

        # Compute the discounted returns at each timestep,
        # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t

        ## We compute this starting from the last timestep to the first, to avoid redundant computations

        ## appendleft() function of queues appends to the position 0
        ## We use deque instead of lists to reduce the time complexity

        for t in range(n_steps)[::-1]:
          disc_return_t = rewards[t] + gamma * (disc_return_t if t + 1 < n_steps else 0)
          returns.appendleft(disc_return_t)

        ## standardization for training stability
        eps = np.finfo(np.float32).eps.item()

        ## eps is added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        trial.report(final_score, i_epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return final_score
In [7]:
# Based off Optuna simple example code
# Changes by CS 175 project group: 
#   - replaced original policy with policy for Ms Pacman
#   - consolidated training code into separate function (previous code box)
def objective(trial):
    hyperparameters = {**default_hyperparams, **sample_hyperparams(trial)}

    # Generate the model.
    policy = Policy(hyperparameters["n_layers"], hyperparameters["h_size"],
                    hyperparameters["dropout"], hyperparameters["state_space"],
                    hyperparameters["action_space"]).to(device)

    # Generate the optimizers.
    optimizer_name = hyperparameters["optimizer"]
    optimizer = getattr(optim, optimizer_name)(policy.parameters(), lr=hyperparameters["lr"])

    score = train(trial, policy, optimizer, epoch=hyperparameters["epoch"],
                  max_t=hyperparameters["max_t"], gamma=hyperparameters["gamma"],
                  ghost_reward=hyperparameters["ghost_reward"],
                  step_penalty_multiplier=hyperparameters["step_penalty_multiplier"],
                  longevity_exponential=hyperparameters["longevity_exponential"],
                  dot_extra_reward=hyperparameters["dot_extra_reward"],
                  energy_pill_extra_reward=hyperparameters["energy_pill_extra_reward"],
                 )

    return score
In [10]:
# Create an Optuna study
# Study info will be saved at path given to "storage" parameter as .db file
study = optuna.create_study(study_name="MsPacMan_study",  
                            direction="maximize", 
                            # Recommend default sampler and pruner for <1000 trials
                            # Comment out following two lines to use default sampler and pruner
#                             sampler=optuna.samplers.CmaEsSampler(consider_pruned_trials=False), 
#                             pruner=optuna.pruners.HyperbandPruner()
                           )
[I 2023-12-11 11:12:15,663] A new study created in memory with name: MsPacMan_study
In [ ]:
# Load saved study
study = optuna.load_study(study_name="MsPacMan_study", storage="sqlite:///Studies/MsPacMan_study.db")
In [11]:
# Start Optuna study
# show_progress_bar=True wouldn't work on Jupyter Notebook without installing Google Colab package
# n_jobs: number of parallel jobs
study.optimize(objective, n_trials=50, timeout=None, n_jobs=1, gc_after_trial=True, show_progress_bar=False)
[I 2023-12-11 11:12:43,250] Trial 0 finished with value: 230.0 and parameters: {'gamma': 0.9999692031511148, 'n_layers': 4, 'h_size': 256, 'dropout': 0.1762122397011413, 'lr': 0.0002502409509623826, 'longevity_exponential': 1.0047816572131334, 'step_penalty_multiplier': 1.0779003102967768, 'dot_extra_reward': 20, 'energy_pill_extra_reward': 57}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:13:02,236] Trial 1 finished with value: 200.0 and parameters: {'gamma': 0.9999753440414736, 'n_layers': 3, 'h_size': 412, 'dropout': 0.6145633028030688, 'lr': 0.00020370932422462964, 'longevity_exponential': 1.0057163897294066, 'step_penalty_multiplier': 1.0529321019556988, 'dot_extra_reward': 20, 'energy_pill_extra_reward': 76}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:13:24,555] Trial 2 finished with value: 200.0 and parameters: {'gamma': 0.9999722852096964, 'n_layers': 4, 'h_size': 372, 'dropout': 0.40882190345708047, 'lr': 3.385352126959437e-05, 'longevity_exponential': 1.0077938949853686, 'step_penalty_multiplier': 1.0899462254339654, 'dot_extra_reward': 7, 'energy_pill_extra_reward': 20}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:14:18,432] Trial 3 finished with value: 220.0 and parameters: {'gamma': 0.9999859381666313, 'n_layers': 3, 'h_size': 967, 'dropout': 0.05000946918026199, 'lr': 3.644208074592394e-05, 'longevity_exponential': 1.009943284717741, 'step_penalty_multiplier': 1.0037587742205414, 'dot_extra_reward': 10, 'energy_pill_extra_reward': 66}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:14:29,042] Trial 4 finished with value: 200.0 and parameters: {'gamma': 0.9999771592838165, 'n_layers': 2, 'h_size': 16, 'dropout': 0.6027795147012084, 'lr': 1.7620923530872803e-05, 'longevity_exponential': 1.0036798711668844, 'step_penalty_multiplier': 1.0278501161835476, 'dot_extra_reward': 19, 'energy_pill_extra_reward': 60}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:14:50,764] Trial 5 finished with value: 170.0 and parameters: {'gamma': 0.9999892535357757, 'n_layers': 4, 'h_size': 415, 'dropout': 0.28736803249840215, 'lr': 0.0036511920726623607, 'longevity_exponential': 1.006319323210271, 'step_penalty_multiplier': 1.0017418504252942, 'dot_extra_reward': 5, 'energy_pill_extra_reward': 6}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:15:40,088] Trial 6 finished with value: 290.0 and parameters: {'gamma': 0.9999861632147989, 'n_layers': 3, 'h_size': 830, 'dropout': 0.29617055176534335, 'lr': 0.0004917749348627604, 'longevity_exponential': 1.0033305391643417, 'step_penalty_multiplier': 1.0285037869184064, 'dot_extra_reward': 15, 'energy_pill_extra_reward': 64}. Best is trial 6 with value: 290.0.
[I 2023-12-11 11:15:40,942] Trial 7 pruned. 
[I 2023-12-11 11:15:41,921] Trial 8 pruned. 
[I 2023-12-11 11:15:42,799] Trial 9 pruned. 
[I 2023-12-11 11:15:46,224] Trial 10 pruned. 
[I 2023-12-11 11:15:49,263] Trial 11 pruned. 
[I 2023-12-11 11:15:50,349] Trial 12 pruned. 
[I 2023-12-11 11:15:51,475] Trial 13 pruned. 
[I 2023-12-11 11:15:52,729] Trial 14 pruned. 
[I 2023-12-11 11:16:12,176] Trial 15 finished with value: 250.0 and parameters: {'gamma': 0.9999808234768816, 'n_layers': 4, 'h_size': 154, 'dropout': 0.12090439946790885, 'lr': 0.00017369346396499275, 'longevity_exponential': 1.0035306163227822, 'step_penalty_multiplier': 1.0424610615800955, 'dot_extra_reward': 18, 'energy_pill_extra_reward': 58}. Best is trial 6 with value: 290.0.
[I 2023-12-11 11:16:13,601] Trial 16 pruned. 
[I 2023-12-11 11:16:14,404] Trial 17 pruned. 
[I 2023-12-11 11:16:16,870] Trial 18 pruned. 
[I 2023-12-11 11:16:34,830] Trial 19 finished with value: 310.0 and parameters: {'gamma': 0.9999926127055915, 'n_layers': 3, 'h_size': 178, 'dropout': 0.0069241572543159435, 'lr': 0.0001014520595303218, 'longevity_exponential': 1.002464702839554, 'step_penalty_multiplier': 1.0457572624287927, 'dot_extra_reward': 11, 'energy_pill_extra_reward': 93}. Best is trial 19 with value: 310.0.
[I 2023-12-11 11:16:36,392] Trial 20 pruned. 
[I 2023-12-11 11:16:57,782] Trial 21 finished with value: 650.0 and parameters: {'gamma': 0.9999931906352822, 'n_layers': 3, 'h_size': 169, 'dropout': 0.006649481779098021, 'lr': 0.00010041720505743183, 'longevity_exponential': 1.0030306466772165, 'step_penalty_multiplier': 1.0403995699975574, 'dot_extra_reward': 9, 'energy_pill_extra_reward': 73}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:16:58,512] Trial 22 pruned. 
[I 2023-12-11 11:16:59,421] Trial 23 pruned. 
[I 2023-12-11 11:17:00,375] Trial 24 pruned. 
[I 2023-12-11 11:17:18,804] Trial 25 finished with value: 210.0 and parameters: {'gamma': 0.9999886717895855, 'n_layers': 3, 'h_size': 237, 'dropout': 0.21330213401198192, 'lr': 1.1509465567672235e-05, 'longevity_exponential': 1.001035991088817, 'step_penalty_multiplier': 1.0416126981983824, 'dot_extra_reward': 11, 'energy_pill_extra_reward': 75}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:17:38,563] Trial 26 finished with value: 340.0 and parameters: {'gamma': 0.9999962717663855, 'n_layers': 2, 'h_size': 329, 'dropout': 0.06399755120130299, 'lr': 7.442700511327361e-05, 'longevity_exponential': 1.0040731706591077, 'step_penalty_multiplier': 1.0304364800076233, 'dot_extra_reward': 4, 'energy_pill_extra_reward': 86}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:17:39,403] Trial 27 pruned. 
[I 2023-12-11 11:17:40,249] Trial 28 pruned. 
[I 2023-12-11 11:17:57,110] Trial 29 finished with value: 300.0 and parameters: {'gamma': 0.9999957696202204, 'n_layers': 2, 'h_size': 211, 'dropout': 0.1351672126242052, 'lr': 0.00029714725045713083, 'longevity_exponential': 1.0050929060501523, 'step_penalty_multiplier': 1.0660298363608978, 'dot_extra_reward': 0, 'energy_pill_extra_reward': 83}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:17:58,133] Trial 30 pruned. 
[I 2023-12-11 11:17:59,355] Trial 31 pruned. 
[I 2023-12-11 11:18:00,135] Trial 32 pruned. 
[I 2023-12-11 11:18:22,801] Trial 33 finished with value: 380.0 and parameters: {'gamma': 0.9999921682544429, 'n_layers': 3, 'h_size': 194, 'dropout': 0.1358679172202715, 'lr': 5.434784683786154e-05, 'longevity_exponential': 1.0028078106929046, 'step_penalty_multiplier': 1.0530090630882818, 'dot_extra_reward': 6, 'energy_pill_extra_reward': 94}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:18:43,621] Trial 34 finished with value: 330.0 and parameters: {'gamma': 0.9999911743807219, 'n_layers': 3, 'h_size': 345, 'dropout': 0.06748343572692499, 'lr': 5.424840934115718e-05, 'longevity_exponential': 1.00280734024581, 'step_penalty_multiplier': 1.051590334980659, 'dot_extra_reward': 6, 'energy_pill_extra_reward': 91}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:18:44,687] Trial 35 pruned. 
[I 2023-12-11 11:18:45,810] Trial 36 pruned. 
[I 2023-12-11 11:18:46,971] Trial 37 pruned. 
[I 2023-12-11 11:18:48,075] Trial 38 pruned. 
[I 2023-12-11 11:18:48,772] Trial 39 pruned. 
[I 2023-12-11 11:18:49,644] Trial 40 pruned. 
[I 2023-12-11 11:18:50,877] Trial 41 pruned. 
[I 2023-12-11 11:18:52,173] Trial 42 pruned. 
[I 2023-12-11 11:19:14,969] Trial 43 finished with value: 320.0 and parameters: {'gamma': 0.9999875621704095, 'n_layers': 3, 'h_size': 326, 'dropout': 0.087581568152122, 'lr': 0.00010853859356981085, 'longevity_exponential': 1.0023722116305447, 'step_penalty_multiplier': 1.0312161519450713, 'dot_extra_reward': 8, 'energy_pill_extra_reward': 91}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:19:16,025] Trial 44 pruned. 
[I 2023-12-11 11:19:17,050] Trial 45 pruned. 
[I 2023-12-11 11:19:17,909] Trial 46 pruned. 
[I 2023-12-11 11:19:19,003] Trial 47 pruned. 
[I 2023-12-11 11:19:20,037] Trial 48 pruned. 
[I 2023-12-11 11:19:20,901] Trial 49 pruned. 
In [12]:
# Recommended hyperparameters from Optuna study
# Exact code from Optuna simple example
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value:  ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
Study statistics: 
  Number of finished trials:  50
  Number of pruned trials:  34
  Number of complete trials:  16
Best trial:
  Value:   650.0
  Params: 
    gamma: 0.9999931906352822
    n_layers: 3
    h_size: 169
    dropout: 0.006649481779098021
    lr: 0.00010041720505743183
    longevity_exponential: 1.0030306466772165
    step_penalty_multiplier: 1.0403995699975574
    dot_extra_reward: 9
    energy_pill_extra_reward: 73
In [13]:
# Importance evaluation for each hyperparameter from Optuna study
optuna.visualization.plot_param_importances(study)

# print("Importances:")
# for key, value in optuna.importance.get_param_importances(study).items():
#   print(key, ":", value)